Mounting Google Drive
#from google.colab import drive
#drive.mount('/content/drive/')
Importing Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import time, os, sys, itertools, re
from PIL import Image
import warnings, pickle, string
from dateutil import parser
%matplotlib inline
# Data Visualization
import cufflinks as cf
import plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs,init_notebook_mode,plot,iplot
from ftfy import fix_text, badness
# Traditional Modeling
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# Tools & Evaluation metrics
from sklearn.metrics import confusion_matrix, classification_report, auc
from sklearn.metrics import roc_curve, accuracy_score, precision_recall_curve
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
Reading the data from excel
#data=pd.read_excel('/content/drive/MyDrive/Capstone/input_data.xlsx')
# Load the raw ticket dump and take a first look at its structure.
data = pd.read_excel('input_data.xlsx')
data.info()
data.head()
# Ticket counts per assignment group — summary statistics of the distribution.
assignment_group_count = data['Assignment group'].value_counts()
assignment_group_count.describe()
# Wide figure: there are ~70 groups to fit on the x axis.
plt.subplots(figsize=(50, 10))
ax = sns.countplot(x='Assignment group', data=data)
ax.set_xticklabels(ax.get_xticklabels(), rotation=30)
# BUG FIX: the original referenced plt.tight_layout without calling it
# (a no-op expression); the parentheses were missing.
plt.tight_layout()
plt.show()
# Most- and least-frequent groups.
assignment_group_count.head(50)
assignment_group_count.tail(24)
Check Missing Values in dataframe
# Missing-value audit and imputation.
data.isnull().sum()
data[data["Short description"].isnull()]
# BUG FIX: the original did `data[mask] = data["Short description"]`, which
# assigns across ALL columns of the masked rows (with index/column
# alignment, effectively corrupting them). The intent is to backfill only
# the 'Description' column from 'Short description'.
data.loc[data["Description"].isnull(), "Description"] = data["Short description"]
data[data["Description"].isnull()]
# Blank out remaining missing short descriptions so string ops don't fail.
data['Short description'] = data['Short description'].replace(np.nan, '', regex=True)
data.isnull().sum()
# Rule matrix used to short-circuit obvious assignments before modelling.
df_rules = pd.read_csv("Rule_matrix.csv")
def _apply_hardcoded_rules(datadf, Description, ShortDescription):
    """Apply the one-off rules that are not expressible in the rule matrix.

    Mutates datadf['pred_group'] in place. Hoisted out of the per-rule loop
    (the original re-ran these identical assignments once per rules row).
    """
    for j in datadf.index:
        sd = datadf[ShortDescription][j]
        if pd.notna(sd) and ('erp' in sd) and ('EU_tool' in sd):
            datadf.loc[j, 'pred_group'] = 'GRP_25'
    for j in datadf.index:
        d = datadf[Description][j]
        if pd.notna(d):
            if d == 'the':
                datadf.loc[j, 'pred_group'] = 'GRP_17'
            sd = datadf[ShortDescription][j]
            # BUG FIX: the original indexed ShortDescription here without a
            # notna() guard, crashing when Description was present but
            # Short description was NaN.
            if pd.notna(sd) and ('finance_app' in sd) and ('HostName_1132' not in sd):
                datadf.loc[j, 'pred_group'] = 'GRP_55'
            if ('processor' in d) and ('engg' in d):
                datadf.loc[j, 'pred_group'] = 'GRP_58'


def applyRules(datadf, rulesdf, Description, ShortDescription):
    """Assign a predicted group to each ticket matching a rule.

    Parameters
    ----------
    datadf : DataFrame with at least the two text columns and 'Caller'.
    rulesdf : rule matrix with columns 'Short Desc Rule', 'Desc Rule',
        'User', 'Short Dec Keyword', 'Dec keyword', 'Group'.
    Description, ShortDescription : names of the text columns in datadf.

    Returns datadf with a new 'pred_group' column (NaN where no rule hit).
    Later rules overwrite earlier ones, as in the original implementation.
    """
    datadf['pred_group'] = np.nan
    _apply_hardcoded_rules(datadf, Description, ShortDescription)
    for i, rule in rulesdf.iterrows():
        short_rule = rule['Short Desc Rule']
        desc_rule = rule['Desc Rule']
        user = rule['User']
        short_kw = rule['Short Dec Keyword']
        desc_kw = rule['Dec keyword']
        group = rule['Group']
        # Both texts begin with their keywords, no user restriction.
        if short_rule == 'begins with' and desc_rule == 'begins with' and pd.isna(user):
            for j in datadf.index:
                sd = datadf[ShortDescription][j]
                d = datadf[Description][j]
                if pd.notna(sd) and pd.notna(d):
                    if sd.startswith(short_kw) and d.startswith(desc_kw):
                        datadf.loc[j, 'pred_group'] = group
        # Description begins with keyword AND the caller matches the rule user.
        if pd.isna(short_rule) and desc_rule == 'begins with' and pd.notna(user):
            for j in datadf.index:
                d = datadf[Description][j]
                if pd.notna(d) and pd.notna(datadf['Caller'][j]):
                    # BUG FIX: the original tested
                    # d.startswith(rulesdf['Desc Rule'][i]) — i.e. against the
                    # literal rule name 'begins with' — instead of the
                    # 'Dec keyword' column, so this branch could never match.
                    if d.startswith(desc_kw) and (user == datadf['Caller'][j]):
                        datadf.loc[j, 'pred_group'] = group
        # Short description contains keyword AND the caller matches.
        if short_rule == 'contains' and pd.notna(user):
            for j in datadf.index:
                sd = datadf[ShortDescription][j]
                if pd.notna(sd) and pd.notna(datadf['Caller'][j]):
                    if (short_kw in sd) and (user == datadf['Caller'][j]):
                        datadf.loc[j, 'pred_group'] = group
        # Short description contains keyword, no other restrictions.
        if short_rule == 'contains' and pd.isna(desc_rule) and pd.isna(user):
            for j in datadf.index:
                sd = datadf[ShortDescription][j]
                if pd.notna(sd) and (short_kw in sd):
                    datadf.loc[j, 'pred_group'] = group
        # Description begins with keyword, no other restrictions.
        if pd.isna(short_rule) and desc_rule == 'begins with' and pd.isna(user):
            for j in datadf.index:
                d = datadf[Description][j]
                if pd.notna(d) and d.startswith(desc_kw):
                    datadf.loc[j, 'pred_group'] = group
        # Description contains keyword, no other restrictions.
        if pd.isna(short_rule) and desc_rule == 'contains' and pd.isna(user):
            for j in datadf.index:
                d = datadf[Description][j]
                if pd.notna(d) and (desc_kw in d):
                    datadf.loc[j, 'pred_group'] = group
    return datadf
# Apply the rule matrix; rows matched by a rule get a non-null pred_group.
rules_applied_df = applyRules(data,df_rules,'Description','Short description')
rules_applied_df
rules_applied_df.info()
# Keep only the tickets NOT resolved by a rule — these go on to modelling.
rules_applied_df = rules_applied_df[(rules_applied_df['pred_group'].isna())]
rules_applied_df.info()
assignment_group_count=rules_applied_df['Assignment group'].value_counts()
assignment_group_count.describe()
#Concatenate Short Description and Description columns
# NOTE(review): rules_applied_df is a filtered slice, so the assignment
# below can raise SettingWithCopyWarning — consider .copy() after filtering.
rules_applied_df['New Description'] = rules_applied_df['Description'] + ' ' +rules_applied_df['Short description']
# Drop the now-merged text columns and the rule output.
clean_data=rules_applied_df.drop(['Short description', 'Description', 'pred_group'], axis=1)
clean_data.info()
# Write a function to apply to the dataset to detect Mojibakes
def is_mojibake_impacted(text):
    """Heuristic mojibake check using ftfy.

    NOTE(review): despite the name, this returns True when the text looks
    FINE and False when it is likely mojibake (weird sequence that is still
    CP-1252 encodable); the caller negates the result when selecting
    impacted rows. Also, badness.sequence_weirdness is deprecated/removed
    in newer ftfy releases — confirm the pinned ftfy version.
    """
    if not badness.sequence_weirdness(text):
        # nothing weird, should be okay
        return True
    try:
        text.encode('sloppy-windows-1252')
    except UnicodeEncodeError:
        # Not CP-1252 encodable, probably fine
        return True
    else:
        # Encodable as CP-1252, Mojibake alert level high
        return False
# Check the dataset for mojibake impact
# NOTE(review): DataFrame.applymap is deprecated in pandas >= 2.1
# (use DataFrame.map) — depends on the pinned pandas version.
clean_data[~clean_data.iloc[:,:].applymap(is_mojibake_impacted).all(1)]
# Take an example of row# 8471 Short Desc and fix it
print('Grabled text: \033[1m%s\033[0m\nFixed text: \033[1m%s\033[0m' % (clean_data['New Description'][8471],
    fix_text(clean_data['New Description'][8471])))
# List all mojibakes defined in ftfy library
print('\nMojibake Symbol RegEx:\n', badness.MOJIBAKE_SYMBOL_RE.pattern)
# Sanitize the dataset from Mojibakes
clean_data['New Description'] = clean_data['New Description'].apply(fix_text)
# Visualize that row# 8471
clean_data.loc[8471]
def date_validity(date_str):
    """Return True if *date_str* can be parsed as a date by dateutil.

    Used to strip date-like tokens during text cleaning.
    """
    try:
        parser.parse(date_str)
        return True
    except (ValueError, OverflowError, TypeError):
        # Narrowed from a bare `except`: dateutil raises ValueError /
        # OverflowError on unparseable input and TypeError on non-strings;
        # a bare except would also swallow KeyboardInterrupt/SystemExit.
        return False
def process(text_string):
    """Normalise a ticket description for modelling.

    Lower-cases the text, drops date-like tokens, strips e-mail header
    fragments, addresses, digits, URLs and punctuation, and collapses
    whitespace. Returns the cleaned string.
    """
    # BUG FIX: the original computed text_string.lower() into an unused
    # variable `text`, so the case-sensitive patterns below ("from:",
    # "sent:", ...) never matched mixed-case text.
    text_string = text_string.lower()
    # Drop tokens that parse as dates.
    text_string = ' '.join([w for w in text_string.split() if not date_validity(w)])
    # Strip e-mail header fragments.
    text_string = re.sub(r"received from:", '', text_string)
    text_string = re.sub(r"from:", ' ', text_string)
    text_string = re.sub(r"to:", ' ', text_string)
    text_string = re.sub(r"subject:", ' ', text_string)
    text_string = re.sub(r"sent:", ' ', text_string)
    text_string = re.sub(r"ic:", ' ', text_string)
    text_string = re.sub(r"cc:", ' ', text_string)
    text_string = re.sub(r"bcc:", ' ', text_string)
    # E-mail addresses, digits, newlines, hashes.
    text_string = re.sub(r'\S*@\S*\s?', '', text_string)
    text_string = re.sub(r'\d+', '', text_string)
    text_string = re.sub(r'\n', ' ', text_string)
    text_string = re.sub(r'#', '', text_string)
    # HTML entities and URLs.
    text_string = re.sub(r'&;?', 'and', text_string)
    text_string = re.sub(r'\&\w*;', '', text_string)
    text_string = re.sub(r'https?:\/\/.*\/\w*', '', text_string)
    # Drop characters outside the Basic Multilingual Plane (emoji etc.).
    text_string = ''.join(c for c in text_string if c <= '\uFFFF')
    text_string = text_string.strip()
    # Keep only ASCII alphanumerics; everything else becomes a space.
    text_string = ' '.join(re.sub("[^\u0030-\u0039\u0041-\u005a\u0061-\u007a]", " ", text_string).split())
    # Drop single-letter tokens and collapse repeated spaces.
    text_string = re.sub(r"\s+[a-zA-Z]\s+", ' ', text_string)
    text_string = re.sub(' +', ' ', text_string)
    text_string = text_string.strip()
    return text_string
# Apply the full text-normalisation pipeline to every ticket.
clean_data["Clean_Description"] = clean_data["New Description"].apply(process)
clean_data
!pip install langdetect
from langdetect import detect
def fn_lang_detect(df):
    """Return the ISO language code detected for the given text.

    Returns the sentinel 'no' when detection fails (e.g. empty or
    feature-less text).
    """
    try:
        return detect(df)
    except Exception:
        # Narrowed from a bare `except`: langdetect raises
        # LangDetectException (an Exception subclass) on undetectable text;
        # KeyboardInterrupt/SystemExit now propagate as they should.
        return 'no'
# Detect the language of every cleaned description.
clean_data['language'] = clean_data['Clean_Description'].apply(fn_lang_detect)
x = clean_data["language"].value_counts()
x=x.sort_index()
plt.figure(figsize=(10,6))
# NOTE(review): positional (x, y) arguments to sns.barplot are removed in
# seaborn >= 0.13 — confirm the pinned seaborn version or use x=/y= keywords.
ax= sns.barplot(x.index, x.values, alpha=0.8)
plt.title("Distribution of text by language")
plt.ylabel('number of records')
plt.xlabel('Language')
# Annotate each bar with its record count.
rects = ax.patches
labels = x.values
for rect, label in zip(rects, labels):
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width()/2, height + 5, label, ha='center', va='bottom')
plt.show();
We can see that most of the tickets are in English, followed by tickets in German. We need to translate the German tickets into English.
#german_data = pd.read_csv("/content/drive/MyDrive/Capstone/german.csv")
# German->English word list used for dictionary-based translation.
german_data = pd.read_csv('german.csv')
german_data
# List of {'German': ..., 'English': ...} records.
german_dictionary = german_data.to_dict(orient='records')
def translate_function(text, dictionary=None):
    """Translate *text* word-by-word using the German->English word list.

    Parameters
    ----------
    text : str — whitespace-separated words.
    dictionary : optional list of {'German': ..., 'English': ...} records;
        defaults to the module-level `german_dictionary` (backward
        compatible with the original one-argument signature).

    Words with no entry pass through unchanged.
    """
    if dictionary is None:
        dictionary = german_dictionary
    # Build the lookup once instead of scanning the whole record list for
    # every word: O(words + entries) instead of O(words * entries). The
    # first entry wins on duplicate German keys (the original appended one
    # translation per matching record, which also deduplicates pathological
    # duplicate entries).
    lookup = {}
    for item in dictionary:
        lookup.setdefault(item["German"], item["English"])
    return ' '.join(lookup.get(word, word) for word in text.split())
# Translate the cleaned text (German words mapped to English, others kept).
clean_data["Translated Text"] = clean_data["Clean_Description"].apply(translate_function)
clean_data.tail(10)
# Inspect the rows that were detected as German.
clean_data[clean_data.language == 'de']
!pip3 install nltk
import nltk
nltk.download('wordnet')
nltk.download('punkt')
from nltk.corpus import wordnet
from collections import OrderedDict
from nltk.tokenize import word_tokenize
def find_synonyms(word):
    """Return WordNet synonyms for *word*, de-duplicated, closest first."""
    candidates = [
        lemma
        for synset in wordnet.synsets(word)
        for lemma in synset.lemma_names()
    ]
    # OrderedDict.fromkeys drops duplicates while keeping first-seen order,
    # so the closest synonyms stay at the front of the list.
    return list(OrderedDict.fromkeys(candidates))
def create_set_of_new_sentences(sentence, max_syn_per_word = 3):
    """Generate augmented variants of *sentence*.

    For every token longer than three characters, substitute up to
    max_syn_per_word WordNet synonyms (one at a time) and collect each
    resulting sentence.
    """
    variants = []
    for token in word_tokenize(sentence):
        if len(token) <= 3:
            continue  # short words rarely have useful synonyms
        for raw_synonym in find_synonyms(token)[:max_syn_per_word]:
            replacement = raw_synonym.replace('_', ' ')  # restore spaces
            variants.append(sentence.replace(token, replacement))
    return variants
# Bucket the assignment groups by ticket volume so rarer groups get more
# aggressive synonym-based augmentation below.
med_records=['GRP_8','GRP_3','GRP_12','GRP_2','GRP_13','GRP_19']
low_records=['GRP_24','GRP_9','GRP_6','GRP_10','GRP_5','GRP_14','GRP_25','GRP_33','GRP_4','GRP_29','GRP_18','GRP_16','GRP_17','GRP_31','GRP_7','GRP_34','GRP_26','GRP_40','GRP_28','GRP_41'
,'GRP_15','GRP_30','GRP_42','GRP_20','GRP_45','GRP_22','GRP_1','GRP_11']
# NOTE(review): 'GRP_53' appears twice in this list — harmless for isin()
# but probably unintended.
vlow_records =['GRP_21','GRP_47','GRP_23','GRP_62','GRP_48','GRP_60','GRP_39','GRP_27','GRP_37','GRP_44','GRP_36','GRP_50','GRP_53','GRP_65','GRP_53','GRP_52','GRP_55','GRP_51','GRP_59','GRP_49','GRP_46','GRP_43','GRP_66','GRP_32','GRP_63','GRP_58','GRP_56','GRP_38','GRP_68','GRP_69','GRP_57','GRP_72','GRP_71','GRP_54','GRP_35','GRP_64','GRP_70','GRP_61','GRP_67','GRP_73']
clean_data1 = clean_data[clean_data["Assignment group"].isin(med_records)]
clean_data2 = clean_data[clean_data["Assignment group"].isin(low_records)]
clean_data3 = clean_data[clean_data["Assignment group"] .isin(vlow_records)]
clean_data4 = clean_data[clean_data["Assignment group"] == 'GRP_0']
clean_data1
clean_data2
clean_data3
clean_data4
# Medium-volume groups: only 1 synonym per word.
maxsyn=1
#clean_data1["Augmented_data"] = clean_data1["Translated Text"].apply(create_set_of_new_sentences)
# NOTE(review): clean_data1 is a filtered slice; this assignment can raise
# SettingWithCopyWarning — consider .copy() when bucketing.
clean_data1["Augmented_data"] = clean_data1.apply(lambda x: create_set_of_new_sentences(x['Translated Text'], maxsyn),axis=1)
clean_data1
# Explode the augmented-sentence lists into one row per sentence.
s = clean_data1.apply(lambda x: pd.Series(x['Augmented_data']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'Final_Text'
clean_data_aug1 = clean_data1.drop(['New Description','Augmented_data', 'Clean_Description', 'Translated Text'],axis=1).join(s)
# Enable offline plotly/cufflinks rendering.
init_notebook_mode()
cf.go_offline()
# Assignment group distribution after augmenting the medium-volume bucket.
print('\033[1mTotal assignment groups:\033[0m', clean_data_aug1['Assignment group'].nunique())
# Histogram
clean_data_aug1['Assignment group'].iplot(
    kind='hist',
    xTitle='Assignment Group',
    yTitle='count',
    title='Assignment Group Distribution- Histogram (Fig-1)')
# Low-volume groups: 6 synonyms per word (heavier augmentation).
maxsyn=6
clean_data2["Augmented_data"] = clean_data2.apply(lambda x: create_set_of_new_sentences(x['Translated Text'], maxsyn),axis=1)
clean_data2
s = clean_data2.apply(lambda x: pd.Series(x['Augmented_data']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'Final_Text'
clean_data_aug2 = clean_data2.drop(['New Description','Augmented_data', 'Clean_Description', 'Translated Text'],axis=1).join(s)
# Assignment group distribution
print('\033[1mTotal assignment groups:\033[0m', clean_data_aug2['Assignment group'].nunique())
# Histogram
clean_data_aug2['Assignment group'].iplot(
    kind='hist',
    xTitle='Assignment Group',
    yTitle='count',
    title='Assignment Group Distribution- Histogram (Fig-2)')
# Very-low-volume groups: 10 synonyms per word (heaviest augmentation).
maxsyn=10
clean_data3["Augmented_data"] = clean_data3.apply(lambda x: create_set_of_new_sentences(x['Translated Text'], maxsyn),axis=1)
clean_data3
s = clean_data3.apply(lambda x: pd.Series(x['Augmented_data']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'Final_Text'
clean_data_aug3 = clean_data3.drop(['New Description','Augmented_data', 'Clean_Description', 'Translated Text'],axis=1).join(s)
# Assignment group distribution
print('\033[1mTotal assignment groups:\033[0m', clean_data_aug3['Assignment group'].nunique())
# Histogram
clean_data_aug3['Assignment group'].iplot(
    kind='hist',
    xTitle='Assignment Group',
    yTitle='count',
    title='Assignment Group Distribution- Histogram (Fig-4)')
# GRP_0 (the dominant class): minimal augmentation.
maxsyn=1
clean_data4["Augmented_data"] = clean_data4.apply(lambda x: create_set_of_new_sentences(x['Translated Text'], maxsyn),axis=1)
clean_data4
s = clean_data4.apply(lambda x: pd.Series(x['Augmented_data']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'Final_Text'
clean_data_aug4 = clean_data4.drop(['New Description','Augmented_data', 'Clean_Description', 'Translated Text'],axis=1).join(s)
# Assignment group distribution
print('\033[1mTotal assignment groups:\033[0m', clean_data_aug4['Assignment group'].nunique())
# Histogram
clean_data_aug4['Assignment group'].iplot(
    kind='hist',
    xTitle='Assignment Group',
    yTitle='count',
    title='Assignment Group Distribution- Histogram (Fig-5)')
# Keep GRP_0 rows without augmentation too: drop the intermediate text
# columns and expose the translated text under the common 'Final_Text' name.
# BUG FIX: the original jammed these three statements onto one physical
# line, which is a SyntaxError when run as a script.
clean_data_mod4 = clean_data4.drop(['New Description', 'Clean_Description'], axis=1)
clean_data_mod4.rename(columns={'Translated Text': 'Final_Text'}, inplace=True)
clean_data_mod4.head()
# Stitch the four augmented buckets back into one training dataset.
dataframes=[clean_data_aug1,clean_data_aug2,clean_data_aug3,clean_data_aug4]
clean_data_result= pd.concat(dataframes)
# Assignment group distribution
print('\033[1mTotal assignment groups:\033[0m', clean_data_result['Assignment group'].nunique())
# Histogram
clean_data_result['Assignment group'].iplot(
    kind='hist',
    xTitle='Assignment Group',
    yTitle='count',
    title='Assignment Group Distribution- Histogram (Fig-5)')
# Stop-word removal
nltk.download('stopwords')
from nltk.corpus import stopwords
# Use a set for O(1) membership tests instead of scanning a list per word.
sr = set(stopwords.words('english'))
# BUG FIX: the original looped with enumerate() and wrote back via
# clean_data_result['Final_Text'][i]; after pd.concat the index labels are
# NOT 0..n-1, so the positional counter i did not match the label i
# (writing to wrong rows or raising KeyError), and chained-indexing
# assignment is unreliable anyway. A vectorised apply fixes both.
clean_data_result['Final_Text'] = clean_data_result['Final_Text'].apply(
    lambda text: " ".join(word for word in text.split(' ') if word not in sr))
clean_data_result
#Lemmatisation using spacy library
!pip install spacy
!pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz
# Need to run "python -m spacy download en" in anaconda prompt to avoid 'en' not found issue.
import spacy
# NOTE(review): the 'en' shortcut is removed in spaCy v3 — there you must
# use spacy.load('en_core_web_sm', ...). This matches the v2.3.1 pin above.
nlp = spacy.load('en', disable=['parser', 'ner'])
# NOTE(review): allowed_postags is defined but never used below — a POS
# filter was presumably planned; confirm before removing.
allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']
def lemmatize_text(text):
    """Lemmatise *text* using the module-level spaCy pipeline `nlp`."""
    lemmas = (token.lemma_ for token in nlp(text))
    return ' '.join(lemmas)
# Lemmatise every ticket text in place.
clean_data_result['Final_Text'] = clean_data_result['Final_Text'].apply(lemmatize_text)
clean_data_result
#!pip install goslate
'''# Define and construct the service urls
domains = ['.com','.com.au','.com.ar','.co.kr','.co.in','.co.jp','.at','.de','.ru','.ch','.fr','.es','.ae']
urls = ['http://translate.google' + domain for domain in domains]'''
"""from goslate import Goslate # Provided by Google
import random
# List of column data to consider for translation
trans_cols = ['Clean_Description']
for idx in range(clean_data.shape[0]):
# Instantiate Goslate class in each iteration
gs = Goslate(service_urls=random.choice(urls))
row_iter = gs.translate(clean_data.loc[idx, trans_cols].tolist(),
target_language='en',
source_language='auto')
clean_data.loc[idx, trans_cols] = list(row_iter)
time.sleep(30)
clean_data.tail()"""
# Serialize the cleaned dataset: CSV (utf_8_sig so Excel reads it cleanly)
# plus a pickle for fast reload.
clean_data_result.to_csv('Final_data.csv', index=False, encoding='utf_8_sig')
with open('Final_data.pkl', 'wb') as f:
    pickle.dump(clean_data_result, f, pickle.HIGHEST_PROTOCOL)
# Load the pickle back.
# BUG FIX: the original opened 'final_data.pkl' (lower-case f) after
# writing 'Final_data.pkl'; on a case-sensitive filesystem that raises
# FileNotFoundError.
with open('Final_data.pkl', 'rb') as f:
    clean_data = pickle.load(f)
Single-variable or univariate visualization is the simplest type of visualization which consists of observations on only a single characteristic or attribute. Univariate visualization includes histogram, bar plots and line charts.
Plots how the assignment groups are distributed across the dataset. The bar chart, histogram and pie chart show the frequency of tickets assigned to each group, i.e. the ticket count per group.
# Assignment group distribution
print('\033[1mTotal assignment groups:\033[0m', clean_data['Assignment group'].nunique())
# Histogram
clean_data['Assignment group'].iplot(
    kind='hist',
    xTitle='Assignment Group',
    yTitle='count',
    title='Assignment Group Distribution- Histogram (Fig-1)')
# Pie chart
assgn_grp = pd.DataFrame(clean_data.groupby('Assignment group').size(),columns = ['Count']).reset_index()
assgn_grp.iplot(
    kind='pie',
    labels='Assignment group',
    values='Count',
    title='Assignment Group Distribution- Pie Chart (Fig-2)',
    hoverinfo="label+percent+name", hole=0.25)
# Plot to visualize the percentage data distribution across different groups
sns.set(style="whitegrid")
plt.figure(figsize=(20,5))
ax = sns.countplot(x="Assignment group", data=clean_data, order=clean_data["Assignment group"].value_counts().index)
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
# Annotate each bar with its share of the total row count.
for p in ax.patches:
    ax.annotate(str(format(p.get_height()/len(clean_data.index)*100, '.2f')+"%"), (p.get_x() + p.get_width() / 2., p.get_height()), ha = 'center', va = 'bottom', rotation=90, xytext = (0, 10), textcoords = 'offset points')
# Top 20 groups by ticket count.
# NOTE(review): value_counts().reset_index() column names changed in
# pandas >= 2.0 ('index'/'Assignment group' became the group name and
# 'count') — the ['index'] access below is version-dependent.
top_20 = clean_data['Assignment group'].value_counts().nlargest(20).reset_index()
plt.figure(figsize=(12,6))
bars = plt.bar(top_20['index'],top_20['Assignment group'])
plt.title('Top 20 Assignment groups with highest number of Tickets')
plt.xlabel('Assignment Group')
plt.xticks(rotation=90)
plt.ylabel('Number of Tickets')
for bar in bars:
    yval = bar.get_height()
    plt.text(bar.get_x(), yval + .005, yval)
plt.tight_layout()
plt.show()
# Bottom 20 groups by ticket count (same version caveat as above).
bottom_20 = clean_data['Assignment group'].value_counts().nsmallest(20).reset_index()
plt.figure(figsize=(12,6))
bars = plt.bar(bottom_20['index'],bottom_20['Assignment group'])
plt.title('Bottom 20 Assignment groups with small number of Tickets')
plt.xlabel('Assignment Group')
plt.xticks(rotation=90)
plt.ylabel('Number of Tickets')
for bar in bars:
    yval = bar.get_height()
    plt.text(bar.get_x(), yval + .005, yval)
plt.tight_layout()
plt.show()
Plots how the callers are associated with tickets and what are the assignment groups they most frequently raise tickets for.
# Find out top 10 callers in terms of frequency of raising tickets in the entire dataset
print('\033[1mTotal caller count:\033[0m', clean_data['Caller'].nunique())
df = pd.DataFrame(clean_data.groupby(['Caller']).size().nlargest(10), columns=['Count']).reset_index()
df.iplot(kind='pie',
    labels='Caller',
    values='Count',
    title='Top 10 caller- Pie Chart (Fig-7)',
    colorscale='-spectral',
    pull=[0,0,0,0,0.05,0.1,0.15,0.2,0.25,0.3])
# Top 5 callers in each assignment group
top_n = 5
s = clean_data['Caller'].groupby(clean_data['Assignment group']).value_counts()
caller_grp = pd.DataFrame(s.groupby(level=0).nlargest(top_n).reset_index(level=0, drop=True))
caller_grp.head(15)
Plots the variation of the length and word count of the new description attribute.
# Character length and word count of the final text, as new columns.
clean_data.insert(1, 'desc_len', clean_data['Final_Text'].astype(str).apply(len))
clean_data.insert(5, 'desc_word_count', clean_data['Final_Text'].apply(lambda x: len(str(x).split())))
clean_data.head()
# Description text length
# NOTE(review): kind='bar' plots one bar per ROW here; a histogram of the
# length distribution may have been intended.
clean_data['desc_len'].iplot(
    kind='bar',
    xTitle='text length',
    yTitle='count',
    colorscale='-ylgn',
    title='Description Text Length Distribution (Fig-11)')
# Description word count
clean_data['desc_word_count'].iplot(
    kind='bar',
    xTitle='word count',
    linecolor='black',
    yTitle='count',
    colorscale='-bupu',
    title='Description Word Count Distribution (Fig-12)')
N-gram is a contiguous sequence of N items from a given sample of text or speech, in the fields of computational linguistics and probability. The items can be phonemes, syllables, letters, words or base pairs according to the application. N-grams are used to describe the number of words used as observation points, e.g., unigram means singly-worded, bigram means 2-worded phrase, and trigram means 3-worded phrase.
We'll be using scikit-learn’s CountVectorizer function to derive n-grams and compare them before and after removing stop words. Stop words are a set of commonly used words in any language. We'll be using english corpus stopwords and extend it to include some business specific common words considered to be stop words in our case.
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from sklearn.feature_extraction.text import CountVectorizer
# Extend the English stop words with domain-specific filler words
# (salutations, pleasantries, e-mail boilerplate).
STOP_WORDS = STOPWORDS.union({'yes','na','hi',
                              'receive','hello',
                              'regards','thanks',
                              'from','greeting',
                              'forward','reply',
                              'will','please',
                              'see','help','able'})
# Generic helper: most frequent n-grams in a corpus.
def get_top_n_ngrams(corpus, top_n=None, ngram_range=(1,1), stopwords=None):
    """Return the *top_n* most frequent n-grams in *corpus*.

    Uses CountVectorizer with the given ngram_range and optional stop-word
    list; yields (ngram, count) pairs sorted by descending count.
    """
    vectorizer = CountVectorizer(ngram_range=ngram_range, stop_words=stopwords)
    counts = vectorizer.fit_transform(corpus)
    totals = counts.sum(axis=0)
    freqs = [(term, totals[0, col]) for term, col in vectorizer.vocabulary_.items()]
    freqs.sort(key=lambda pair: pair[1], reverse=True)
    return freqs[:top_n]
# Top 50 Unigrams before removing stop words
top_n = 50
ngram_range = (1,1)
uni_grams = get_top_n_ngrams(clean_data.Final_Text, top_n, ngram_range)
df = pd.DataFrame(uni_grams, columns = ['Final_Text' , 'count'])
df.groupby('Final_Text').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar',
    yTitle='Count',
    linecolor='black',
    colorscale='piyg',
    title=f'Top {top_n} Unigrams in Final_Text')
# Top 50 Unigrams after removing stop words
uni_grams_sw = get_top_n_ngrams(clean_data.Final_Text, top_n, ngram_range, stopwords=STOP_WORDS)
df = pd.DataFrame(uni_grams_sw, columns = ['Final_Text' , 'count'])
df.groupby('Final_Text').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar',
    yTitle='Count',
    linecolor='black',
    colorscale='-piyg',
    title=f'Top {top_n} Unigrams in Final_Text without stop words')
# Top 50 Bigrams before removing stop words
top_n = 50
ngram_range = (2,2)
bi_grams = get_top_n_ngrams(clean_data.Final_Text, top_n, ngram_range)
df = pd.DataFrame(bi_grams, columns = ['Final_Text' , 'count'])
df.groupby('Final_Text').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar',
    yTitle='Count',
    linecolor='black',
    colorscale='piyg',
    title=f'Top {top_n} Bigrams in Final_Text')
# Top 50 Bigrams after removing stop words
bi_grams_sw = get_top_n_ngrams(clean_data.Final_Text, top_n, ngram_range, stopwords=STOP_WORDS)
df = pd.DataFrame(bi_grams_sw, columns = ['Final_Text' , 'count'])
df.groupby('Final_Text').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar',
    yTitle='Count',
    linecolor='black',
    colorscale='-piyg',
    title=f'Top {top_n} Bigrams in Final_Text without stop words')
# Top 50 Trigrams before removing stop words
top_n = 50
ngram_range = (3,3)
tri_grams = get_top_n_ngrams(clean_data.Final_Text, top_n, ngram_range)
df = pd.DataFrame(tri_grams, columns = ['Final_Text' , 'count'])
df.groupby('Final_Text').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar',
    yTitle='Count',
    linecolor='black',
    colorscale='piyg',
    title=f'Top {top_n} Trigrams in Final_Text')
# Top 50 Trigrams after removing stop words
tri_grams_sw = get_top_n_ngrams(clean_data.Final_Text, top_n, ngram_range, stopwords=STOP_WORDS)
df = pd.DataFrame(tri_grams_sw, columns = ['Final_Text' , 'count'])
df.groupby('Final_Text').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar',
    yTitle='Count',
    linecolor='black',
    colorscale='-piyg',
    title=f'Top {top_n} Trigrams in Final_Text without stop words')
Let us attempt to visualize this as a word cloud for the top three groups with the most records. A word cloud lets us visualize the data as a cluster of words, with each word displayed in a font size proportional to its number of occurrences. Essentially, the bigger and bolder a word appears in the visualization, the more often it is mentioned within the given text relative to the other words in the cloud, and therefore the more important it is for us.
Let's write a generic method to generate Word Clouds for both Short and Long Description columns.
def generate_word_cloud(corpus):
    """Render a word cloud for *corpus* using the shared STOP_WORDS set."""
    cloud = WordCloud(
        width=800,
        height=800,
        background_color='white',
        stopwords=STOP_WORDS,
        min_font_size=10,
    ).generate(corpus)
    # Display the rendered cloud without axes.
    plt.figure(figsize=(12, 12), facecolor=None)
    plt.imshow(cloud)
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.show()
# Word Cloud for all tickets assigned to GRP_0
generate_word_cloud(' '.join(clean_data[clean_data['Assignment group'] == 'GRP_0'].Final_Text.str.strip()))
# Word Cloud for all tickets assigned to GRP_8
generate_word_cloud(' '.join(clean_data[clean_data['Assignment group'] == 'GRP_8'].Final_Text.str.strip()))
# Word Cloud for all tickets assigned to GRP_25
generate_word_cloud(' '.join(clean_data[clean_data['Assignment group'] == 'GRP_25'].Final_Text.str.strip()))
# Generate wordcloud for the whole Final_Text field
generate_word_cloud(' '.join(clean_data.Final_Text.str.strip()))
# Import label encoder
from sklearn import preprocessing
# label_encoder object knows how to understand word labels.
label_encoder = preprocessing.LabelEncoder()
# Encode the target: map each assignment-group name to an integer code.
clean_data['Assignment group LabelEncoded']= label_encoder.fit_transform(clean_data['Assignment group'])
clean_data['Assignment group LabelEncoded'].unique()
# Keep a group-name -> integer-code mapping for later decoding.
label_encoded_dict = dict(zip(clean_data['Assignment group'].unique(), clean_data['Assignment group LabelEncoded'].unique()))
len(label_encoded_dict)
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words feature matrix over the final text.
CV = CountVectorizer()
X_BoW = CV.fit_transform(clean_data['Final_Text']).toarray()
y = clean_data['Assignment group LabelEncoded']
print("Shape of Input Feature :",np.shape(X_BoW))
print("Shape of Target Feature :",np.shape(y))
# Splitting Train Test (70/30, fixed seed for reproducibility)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_BoW, y, test_size=0.3, random_state = 1)
print('\033[1mShape of the training set:\033[0m', X_train.shape, X_test.shape)
print('\033[1mShape of the test set:\033[0m', y_train.shape, y_test.shape)
def run_classification(estimator, X_train, X_test, y_train, y_test, arch_name=None, pipelineRequired=True, isDeepModel=False):
    """Fit *estimator* on the training split and print evaluation metrics.

    When pipelineRequired is True the estimator is wrapped in a
    TF-IDF -> classifier Pipeline. When isDeepModel is True the estimator
    is assumed to be a Keras-style model (fit with epochs/callbacks and
    argmax over class probabilities). Prints train/test accuracy, the
    confusion matrix and a classification report; returns nothing.
    """
    # train the model
    clf = estimator
    if pipelineRequired :
        clf = Pipeline([('tfidf', TfidfTransformer()),
                        ('clf', estimator),
                        ])
    if isDeepModel :
        # NOTE(review): call_backs() is not defined anywhere in this file —
        # the deep-model path raises NameError unless it is defined in a
        # cell not shown here. Confirm before using isDeepModel=True.
        clf.fit(X_train, y_train, validation_data=(X_test, y_test),epochs=10, batch_size=128,verbose=1,callbacks=call_backs(arch_name))
        # predict from the classifier; argmax converts class probabilities
        # to label indices
        y_pred = clf.predict(X_test)
        y_pred = np.argmax(y_pred, axis=1)
        y_train_pred = clf.predict(X_train)
        y_train_pred = np.argmax(y_train_pred, axis=1)
    else :
        clf.fit(X_train, y_train)
        # predict from the classifier
        y_pred = clf.predict(X_test)
        y_train_pred = clf.predict(X_train)
    print('Estimator:', clf)
    print('='*80)
    print('Training accuracy: %.2f%%' % (accuracy_score(y_train,y_train_pred) * 100))
    print('Testing accuracy: %.2f%%' % (accuracy_score(y_test, y_pred) * 100))
    print('='*80)
    print('Confusion matrix:\n %s' % (confusion_matrix(y_test, y_pred)))
    print('='*80)
    print('Classification report:\n %s' % (classification_report(y_test, y_pred)))
# Benchmark a suite of classical classifiers on the bag-of-words features.
run_classification(LogisticRegression(), X_train, X_test, y_train, y_test)
run_classification(MultinomialNB(), X_train, X_test, y_train, y_test)
run_classification(KNeighborsClassifier(), X_train, X_test, y_train, y_test)
run_classification(LinearSVC(), X_train, X_test, y_train, y_test)
run_classification(DecisionTreeClassifier(), X_train, X_test, y_train, y_test)
run_classification(RandomForestClassifier(n_estimators=100), X_train, X_test, y_train, y_test)